001    /*
002     * FactorSequence.java
003     *
004     * Copyright 2003 Sergio Anibal de Carvalho Junior
005     *
006     * This file is part of NeoBio.
007     *
008     * NeoBio is free software; you can redistribute it and/or modify it under the terms of
009     * the GNU General Public License as published by the Free Software Foundation; either
010     * version 2 of the License, or (at your option) any later version.
011     *
012     * NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
013     * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
014     * PURPOSE. See the GNU General Public License for more details.
015     *
016     * You should have received a copy of the GNU General Public License along with NeoBio;
017     * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
018     * Boston, MA 02111-1307, USA.
019     *
020     * Proper attribution of the author as the source of the software would be appreciated.
021     *
022     * Sergio Anibal de Carvalho Junior             mailto:sergioanibaljr@users.sourceforge.net
023     * Department of Computer Science               http://www.dcs.kcl.ac.uk
024     * King's College London, UK                    http://www.kcl.ac.uk
025     *
026     * Please visit http://neobio.sourceforge.net
027     *
028     * This project was supervised by Professor Maxime Crochemore.
029     *
030     */
031    
032    package neobio.alignment;
033    
034    import java.io.Reader;
035    import java.io.BufferedReader;
036    import java.io.IOException;
037    
038    /**
039     * This class builds a list of factors of a character sequence as induced by its
040     * Lempel-Ziv (LZ78) factorisation. Each factor is enconded as the longest factor
041     * previously seen plus one character.
042     *
043     * <P>The input can come from any source, provided it is encapsulated in a proper
044     * <CODE>Reader</CODE> instance. The stream is expected to be ready (i.e. the next
045     * <CODE>read</CODE> operation must return the first character of the sequence) and it is
046     * not closed when its end is reached, so the client is allowed to reset it and maybe use
047     * it for another purpose.</P>
048     *
049     * <P>Sequences can contain letters only although lines started with the
050     * <CODE>COMMENT_CHAR</CODE> character ('>') are regarded as comments and are completely
051     * skipped. White spaces (including tabs, line feeds and carriage returns) are also
052     * ignored throughout.</P>
053     *
054     * <P>This class uses a {@linkplain Trie} to keep track of a list of factors. Each node of
055     * the trie contains a {@linkplain Factor} of the text. As the sequence is read from the
056     * input, the trie is traversed as far as possible. When a leaf node is reached (which
057     * means that the longest prefix of the input has been found), two tasks are
058     * accomplished:</P>
059     *
060     * <UL>
061     * <LI>a new <CODE>Factor</CODE> is created with the character at the current position of
062     * the input and the leaf node's factor;
063     * <LI>a new node is added to the trie with the character at the current position of the
064     * input;
065     * </UL>
066     *
067     * <P>Each factor also receives a serial number according to the order they are found and
068     * a pointer to the next factor (in that order) for fast access. This pointer, together
069     * with the factor's ancestor pointer forms a doubly-linked list of factors. The original
070     * text can then be reconstructed simply by following the linked list and writing out its
071     * factors.</P>
072     *
073     * <P>As an example, the sequence <CODE>ACTAAACCGCATTAATAATAAAA</CODE> is parsed into the
074     * following 12 factors:</P>
075     *
076     * <CODE><BLOCKQUOTE><PRE>
077     * 0  ( , ) = empty
078     * 1  (0,A) = A
079     * 2  (0,C) = C
080     * 3  (0,T) = T
081     * 4  (1,A) = AA
082     * 5  (1,C) = AC
083     * 6  (2,G) = CG
084     * 7  (2,A) = CA
085     * 8  (3,T) = TT
086     * 9  (4,T) = AAT
087     * 10 (9,A) = AATA
088     * 11 (4,A) = AAA
089     *
090     * serial # (prefix, new char) = factor text
091     * </PRE></BLOCKQUOTE></CODE>
092     *
093     * <P>This class is used by {@linkplain CrochemoreLandauZivUkelson} algorithm to speed up
094     * the classic dynamic programming approach to sequence alignment.</P>
095     *
096     * @author Sergio A. de Carvalho Jr.
097     * @see Factor
098     * @see Trie
099     * @see CrochemoreLandauZivUkelson
100     */
101    public class FactorSequence
102    {
103            /**
104             * The character used to start a comment line in a sequence file. When this character
105             * is found, the rest of the line is ignored.
106             */
107            protected static final char COMMENT_CHAR = '>';
108    
109            /**
110             * A pointer to the root factor, the one that starts the list of factors.
111             */
112            protected Factor root_factor;
113    
114            /**
115             * The numbers of character represented by this sequence.
116             */
117            protected int num_chars;
118    
119            /**
120             * The numbers of factors generated by the LZ78 parsing of the sequence.
121             */
122            protected int num_factors;
123    
124            /**
125             * Creates a new instance of a <CODE>FactorSequence</CODE>, loading the sequence data
126             * from the <CODE>Reader</CODE> input stream. A doubly-linked list of factors is built
127             * according to its LZ78 factorisation.
128             *
129             * @param reader source of characters for this sequence
130             * @throws IOException if an I/O exception occurs when reading the input
131             * @throws InvalidSequenceException if the input does not contain a valid sequence
132             */
133            public FactorSequence (Reader reader)
134                    throws IOException, InvalidSequenceException
135            {
136                    BufferedReader  input = new BufferedReader(reader);
137                    Trie                    root_node, current_node, new_node = null;
138                    Factor                  current_factor, last_factor, new_factor;
139                    int                             ch;
140                    char                    c;
141    
142                    // create root factor and the root node of the trie
143                    root_factor = new Factor ();
144                    root_node = new Trie (root_factor);
145                    num_factors = 1;
146                    num_chars = 0;
147    
148                    current_node = root_node;
149                    last_factor = root_factor;
150    
151                    // read characters from the input
152                    while ((ch = input.read()) != -1)
153                    {
154                            c = (char) ch;
155    
156                            if (c == COMMENT_CHAR)
157                                    // it's a comment line: skip it!
158                                    input.readLine();
159    
160                            // accept letters only
161                            else if (Character.isLetter(c))
162                            {
163                                    num_chars++;
164    
165                                    // walk down the trie as far as possible
166                                    new_node = current_node.spellDown(c);
167    
168                                    if (new_node != null)
169                                    {
170                                            current_node = new_node;
171                                    }
172                                    else
173                                    {
174                                            // the longest factor of the input has been found,
175                                            // now create a new factor from the current node's factor
176                                            current_factor = (Factor) current_node.getData();
177                                            new_factor = new Factor (current_factor, num_factors, c);
178    
179                                            // add the new character to the trie as well
180                                            current_node.add (new_factor, c);
181    
182                                            // set up a pointer from the last factor to the new one
183                                            last_factor.setNext (new_factor);
184                                            last_factor = new_factor;
185    
186                                            // restart at the root of the trie
187                                            current_node = root_node;
188    
189                                            num_factors++;
190                                    }
191                            }
192    
193                            // anything else, except whitespaces, will throw an exception
194                            else if (!Character.isWhitespace(c))
195                                    throw new InvalidSequenceException
196                                            ("Sequences can contain letters only.");
197                    }
198    
199                    // if new_node is not null, the last factor is actually
200                    // not a new factor but a factor already created
201                    if (new_node != null)
202                    {
203                            // no new node is created, just point the last_factor to an
204                            // existing one that represents the last characters of the text
205                            last_factor.setNext((Factor) new_node.getData());
206    
207                            num_factors++;
208                    }
209    
210                    // check if read anything useful!
211                    if (num_factors <= 1)
212                            throw new InvalidSequenceException ("Empty sequence.");
213            }
214    
215            /**
216             * Returns the root factor, the one that starts the list of factors.
217             *
218             * @return root factor
219             */
220            public Factor getRootFactor ()
221            {
222                    return root_factor;
223            }
224    
225            /**
226             * Returns the number of factors produced by the LZ78 parsing of the text.
227             *
228             * @return number of factors
229             */
230            public int numFactors()
231            {
232                    return num_factors;
233            }
234    
235            /**
236             * Returns the number of characters of the original sequence.
237             *
238             * @return number of characters of the original sequence
239             */
240            public int numChars ()
241            {
242                    return num_chars;
243            }
244    
245            /**
246             * Reconstructs the sequence from the list of factors induced by the LZ78 parsing of
247             * the text.
248             *
249             * @return the original sequence
250             */
251            public String toString ()
252            {
253                    StringBuffer    buf = new StringBuffer();
254                    Factor                  node;
255    
256                    node = root_factor.getNext();
257    
258                    for (int i = 1; i < numFactors(); i++)
259                    {
260                            buf.append(node);
261    
262                            node = node.getNext();
263                    }
264    
265                    return buf.toString();
266            }
267    
268            /**
269             * Returns a string representation of the actual list of factors produced by the LZ78
270             * parsing of the text. Each factor is printed out in a separate line, in the order
271             * they appear in the text, with its serial number, its ancestor's serial number, its
272             * new character, length and a string representation of the factor itself.
273             *
274             * @return a string representation of the list of factors
275             */
276            public String printFactors ()
277            {
278                    StringBuffer    buf = new StringBuffer();
279                    Factor                  factor;
280    
281                    factor = root_factor.getNext();
282    
283                    for (int i = 1; i < numFactors(); i++)
284                    {
285                            buf.append (factor.getSerialNumber() + "\t<");
286                            buf.append (factor.getAncestor().getSerialNumber() + " ,\t");
287                            buf.append (factor.getNewChar() + ">\t");
288                            buf.append (factor.length() + "\t" + factor + "\n");
289    
290                            factor = factor.getNext();
291                    }
292    
293                    buf.append(numFactors() + " factors\n");
294    
295                    return buf.toString();
296            }
297    }